notebook.community

Edit and run



In [1]:

    
%matplotlib inline
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
from collections import Counter

#import colorlover as cl

from IPython.display import HTML, display

from chorogrid import Colorbin, Chorogrid



In [2]:

    
sns.set_context("poster")
sns.set_style("ticks")



In [3]:

    
# Raw topic key -> display name. The key is the part of the raw topic_name
# that precedes the first '/'.
TOPIC_MAPPING = dict(
    GunControl="Gun Control",
    Privacy="Privacy",
    Vaccine="Vaccine",
    ChildEducation="Child Education",
    SkinDamage="Skin Damage",
    SeatBelt="Seat Belt",
)

# Order in which topics are iterated by the per-topic plots and tables.
topic_order = [
    "Gun Control",
    "Privacy",
    "Vaccine",
    "Child Education",
    "Skin Damage",
    "Seat Belt",
]
# Load the analysis table and replace each raw topic_name with its display
# name: the text before the first '/' is looked up in TOPIC_MAPPING, so an
# unmapped topic raises KeyError. (The former `.rename(columns={})` call had no
# active entries and was a no-op, so it has been dropped.)
df = pd.read_hdf("FINAL_ANALYSIS_DATA.h5", "final_data").assign(
    topic_name=lambda x: x.topic_name.apply(lambda k: TOPIC_MAPPING[k.split('/')[0]]),
)
# u_state codes that are filtered out wherever only the 50 states are wanted:
# unknown location, country-level "USA", DC and the territory codes.
NON_STATES = {
    "UNK", "USA",
    "AS", "DC", "GU", "MP", "PR", "VI",
}


# Population table, one "<population><TAB><postal code>" row per line
# (the 50 states plus DC).
_STATE_POPULATION_ROWS = """4863300.00	AL
741894.00	AK
6931071.00	AZ
2988248.00	AR
39250017.00	CA
5540545.00	CO
3576452.00	CT
952065.00	DE
681170.00	DC
20612439.00	FL
10310371.00	GA
1428557.00	HI
1683140.00	ID
12801539.00	IL
6633053.00	IN
3134693.00	IA
2907289.00	KS
4436974.00	KY
4681666.00	LA
1331479.00	ME
6016447.00	MD
6811779.00	MA
9928300.00	MI
5519952.00	MN
2988726.00	MS
6093000.00	MO
1042520.00	MT
1907116.00	NE
2940058.00	NV
1334795.00	NH
8944469.00	NJ
2081015.00	NM
19745289.00	NY
10146788.00	NC
757952.00	ND
11614373.00	OH
3923561.00	OK
4093465.00	OR
12784227.00	PA
1056426.00	RI
4961119.00	SC
865454.00	SD
6651194.00	TN
27862596.00	TX
3051217.00	UT
624594.00	VT
8411808.00	VA
7288000.00	WA
1831102.00	WV
5778708.00	WI
585501.00	WY
"""

# Postal code -> population (float).
# The raw text lives under its own name so STATE_POPULATIONS is only ever a
# dict. Rows are split on any whitespace and blank lines are skipped, so the
# parse no longer depends on the separator surviving copy/paste as a literal
# tab character.
STATE_POPULATIONS = {
    code: float(population)
    for population, code in (
        row.split()
        for row in _STATE_POPULATION_ROWS.splitlines()
        if row.strip()
    )
}

# CSV handed to Chorogrid as its first argument by every map in this notebook.
# NOTE: this is a hard-coded absolute path; it must exist on the machine that
# runs the notebook.
CHOROGRID_STATES_FILE='/content/Code/smishra8/chorogrid/chorogrid/databases/usa_states.csv'



In [4]:

    
# Spot-check one entry of the parsed population table.
STATE_POPULATIONS["AZ"]









    Out[4]:





6931071.0



In [5]:

    
# List the columns available in the loaded frame.
df.columns









    Out[5]:





Index([          u'Author',       u'City/Urban',  u'City/Urban Area',
               u'Contents',             u'Date',        u'Followers',
              u'Following',             u'GUID',           u'Gender',
                   u'Name',            u'Posts',              u'RT?',
           u'State/Region',              u'URL',             u'URL?',
              u'adjective',           u'adverb',      u'count_tweet',
               u'hashtag?',         u'mention?',         u'negation',
                   u'noun',      u'preposition',    u'processedPost',
              u'sentiment',   u'sentiment_subj', u'subjectvity_type',
                   u't_id',       u'topic_name',             u'verb',
              u't_created',       u't_retweets',      u't_favorites',
             u't_is_reply',       u't_is_quote',     u't_n_hashtags',
               u't_n_urls',     u't_n_mentions',        u't_n_media',
                   u'u_id',        u'u_created',       u'u_n_listed',
          u'u_n_favorites',    u'u_n_followers',      u'u_n_friends',
           u'u_n_statuses',    u'u_is_verified',       u'u_location',
                 u'u_name',            u'u_url', u'is_controversial',
                    u'TID',             u'CATS',          u'u_state'],
      dtype='object')



In [6]:

    
# Summary statistics of the number of distinct CATS labels per tweet.
# Rows with a missing CATS value are counted as one placeholder label ('UNK').
(
    df.CATS
    .fillna(0)
    .apply(lambda cats: Counter(cats) if not (cats == 0) else Counter(['UNK']))
    .apply(len)
    .describe()
)









    Out[6]:





count    246869.000000
mean          1.139163
std           0.356983
min           1.000000
25%           1.000000
50%           1.000000
75%           1.000000
max           5.000000
Name: CATS, dtype: float64



In [7]:

    
# Store each tweet's CATS labels as a Counter; rows without a CATS value get
# the single placeholder label 'NONE'.
df["CATS_Counter"] = df.CATS.fillna(0).apply(
    lambda cats: Counter(cats) if not (cats == 0) else Counter(['NONE'])
)

# Preview the first few tweets that carry exactly two distinct labels.
has_two_labels = df.CATS_Counter.apply(len) == 2
df.loc[has_two_labels, "CATS_Counter"].head()









    Out[7]:





23     {u'socialmedia': 1, u'videos': 1}
29    {u'twitter': 1, u'socialmedia': 1}
38     {u'socialmedia': 1, u'videos': 1}
53     {u'socialmedia': 1, u'videos': 1}
54    {u'twitter': 1, u'socialmedia': 1}
Name: CATS_Counter, dtype: object

Chorogrid plot



In [8]:

    
# Per-state mean, row count and std of is_controversial, with the
# country-level "USA" rows left out.
not_country_level = df.u_state != "USA"
df_t = (
    df[not_country_level]
    .groupby("u_state")["is_controversial"]
    .agg([np.mean, len, np.std])
    .reset_index()
)
df_t



In [9]:

    
# Six hex colours for the bins, handed to chorogrid's Colorbin together with
# the per-state means computed above.
mycolors = ['#b35806', '#f1a340', '#fee0b6', '#d8daeb', '#998ec3', '#542788']
mybin = Colorbin(df_t['mean'], mycolors, proportional=True, decimals=None)
# Label precision and bin recalculation; the exact semantics of these calls are
# defined by chorogrid's Colorbin (see its documentation).
mybin.set_decimals(1)
mybin.recalc(fenceposts=True)
# Populates mybin.complements, which a later cell reads as the label font colours.
mybin.calc_complements(0.5, '#e0e0e0', '#101010')



In [10]:

    
# Pull the per-state and legend lists out of the Colorbin.
states = list(df_t.u_state)
colors_by_state = mybin.colors_out
font_colors_by_state = mybin.complements
legend_colors = mybin.colors_in
legend_labels = mybin.labels

# Print the length and first three items of each list as a sanity check.
# The (name, object) pairs are spelled out explicitly rather than resolving
# variable names through eval().
for name, obj in [
    ('states', states),
    ('colors_by_state', colors_by_state),
    ('font_colors_by_state', font_colors_by_state),
    ('legend_colors', legend_colors),
    ('legend_labels', legend_labels),
]:
    print("{:>20}: len {:2}: {}...".format(name, len(obj), obj[:3]))









    



              states: len 56: ['AK', 'AL', 'AR']...
     colors_by_state: len 56: ['#d8daeb', '#d8daeb', '#d8daeb']...
font_colors_by_state: len 56: ['#101010', '#101010', '#101010']...
       legend_colors: len  6: ['#b35806', '#f1a340', '#fee0b6']...
       legend_labels: len  6: [u'0.0-0.2', u'0.2-0.3', u'0.3-0.5']...



In [11]:

    
# Draw the multihex US map coloured by the per-state mean, with a legend, and
# show it inline.
cg = Chorogrid(CHOROGRID_STATES_FILE, states, colors_by_state)
cg.set_title('mean', font_dict={'font-size': 19})
cg.set_legend(legend_colors, legend_labels, title='mean')
cg.draw_multihex(spacing_dict={'margin_right': 150}) # otherwise legend will be cut off
    # another strategy would be to pass a legend_offset to spacing_dict
cg.done(show=True)









    



WARNING: The following are not recognized ids: set(['PR', 'VI', 'GU', 'AS', 'MP'])

Plot individual topic maps



In [12]:

    
def logit_transform(p):
    """Return the smoothed log-odds of ``p``.

    A small constant (1e-8) is added to both ``p`` and ``1 - p`` before the
    ratio is taken, so inputs of exactly 0 or 1 still give a finite result.
    ``p`` may be a scalar or anything NumPy can broadcast over.
    """
    smoothing = 1e-8
    odds = (p + smoothing) / (1 - p + smoothing)
    return np.log(odds)



In [13]:

    
def plot_map(df, location_col, value_col, text_cols,
            scl="Portland", title="", cbar_title="", decimals=2, value_transform=None):
    """Draw a multihex US map coloured by one column of ``df`` and show it.

    Parameters
    ----------
    df : DataFrame with one row per location.
    location_col : name of the column holding the ids passed to Chorogrid.
    value_col : name of the column whose values (cast to float) are binned
        into colours.
    text_cols, scl : accepted for call compatibility; not used by this function.
    title : map title.
    cbar_title : legend title.
    decimals : forwarded to ``Colorbin.set_decimals`` for the legend labels.
    value_transform : optional. A callable is applied to the values before
        binning; any other truthy value (e.g. ``True``) applies
        ``logit_transform``; a falsy value leaves the values untouched.

    Returns
    -------
    None. The map is rendered through ``Chorogrid.done(show=True)``.
    """
    # Nine-step palette used for the colour bins.
    mycolors = ['#ffffd9','#edf8b1','#c7e9b4','#7fcdbb','#41b6c4','#1d91c0','#225ea8','#253494','#081d58']

    values = df[value_col].astype(float)
    if value_transform:
        # Previously logit_transform was applied whenever this argument was
        # truthy, even when the caller supplied a different function. A
        # callable is now honoured; non-callable truthy flags keep the old
        # logit behaviour.
        transform = value_transform if callable(value_transform) else logit_transform
        values = transform(values)

    # Colorbin is built without proportional=True here.
    mybin = Colorbin(values, mycolors,
                     decimals=None)
    mybin.set_decimals(decimals)
    mybin.recalc(fenceposts=True)
    mybin.calc_complements(0.5, '#e0e0e0', '#101010')

    states = list(df[location_col])
    colors_by_state = mybin.colors_out
    font_colors_by_state = mybin.complements
    legend_colors = mybin.colors_in
    legend_labels = mybin.labels

    cg = Chorogrid(
        CHOROGRID_STATES_FILE,
        states, colors_by_state,
    )
    cg.set_title(title, font_dict={'font-size': 19})
    cg.set_legend(legend_colors, legend_labels, title=cbar_title,
                  font_dict={'font-size': '10px', })
    # margin_right leaves room for the legend, which is otherwise cut off.
    cg.draw_multihex(spacing_dict={
        'margin_right': 150,
        'missing_color': '#ffffff',
        'stroke_color': '#000000',
        'stroke_width': 0.1
    }, font_dict={
        'stroke-width': '0.1px',
    }, font_colors=font_colors_by_state)
    cg.done(show=True)



In [14]:

    
# Recompute the per-state summary of is_controversial ("USA" rows excluded)
# and map its mean.
df_t = (
    df[df.u_state != "USA"]
    .groupby("u_state")["is_controversial"]
    .agg([np.mean, len, np.std])
    .reset_index()
)

plot_map(
    df_t,
    location_col="u_state",
    value_col="mean",
    text_cols=["u_state", "len", "std"],
    scl='Portland',
    title="Proportion of controversial tweets per state",
    cbar_title="Proportion",
)









    



WARNING: The following are not recognized ids: set(['PR', 'VI', 'GU', 'AS', 'MP'])



In [15]:

    
# Per-state mean of the per-tweet 'fakenews' label count (over all tweets).
# Country-level "USA" rows are dropped first, as in the other per-state maps:
# "USA" is not an id Chorogrid recognises, and its value would otherwise still
# take part in the colour binning.
df_t = df[df.u_state != "USA"].assign(
    fakenews=lambda x: x.CATS_Counter.apply(lambda k: k.get('fakenews', 0))
)[["u_state", "fakenews"]].groupby("u_state")["fakenews"].agg([np.mean, len, np.std]).reset_index()

plot_map(df_t,
         "u_state", "mean", ["u_state","len", "std"], scl='Portland',
         title="Proportion of fakenews urls per state",
         cbar_title="Proportion"
        )









    



WARNING: The following are not recognized ids: set(['PR', 'GU', 'USA', 'VI', 'AS', 'MP'])



In [16]:

    
# Number of highest-volume locations considered for each map.
TOP_N_STATES = 10

for url_type in ["fakenews", "news", "blog"]:
    # Per-state mean count of the `url_type` label among tweets that contain
    # at least one URL ("USA" rows excluded).
    df_t = df[(df.u_state != "USA")
        & (df.t_n_urls > 0)].assign(**{
        url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))}
    )[["u_state", url_type]].groupby("u_state")[url_type].agg([np.mean, len, np.std]).reset_index()

    # Tweet volume of the TOP_N_STATES-th busiest location. nlargest() copes
    # with fewer than TOP_N_STATES rows (everything is then kept), whereas
    # indexing the sorted values at [-10] raised IndexError in that case.
    min_tweets = df_t["len"].nlargest(TOP_N_STATES).min()

    # The volume cut-off is taken before NON_STATES are removed, so a non-state
    # among the busiest locations leaves fewer than TOP_N_STATES on the map.
    plot_map(df_t[
            (df_t["len"] >= min_tweets)
            & (~df_t["u_state"].isin(NON_STATES))
        ],
             "u_state", "mean", ["u_state","len", "std"], scl='Portland',
             title="Proportion of %s urls (in tweets with URLs) per state" % url_type.title(),
             cbar_title="Proportion"
            )









    



WARNING: The following ids in the csv are not included: set(['DE', 'DC', 'WI', 'WV', 'HI', 'WY', 'NH', 'NJ', 'NM', 'LA', 'NC', 'ND', 'NE', 'TN', 'RI', 'NV', 'CO', 'AK', 'AL', 'AR', 'VT', 'IN', 'IA', 'MA', 'AZ', 'ID', 'CT', 'ME', 'MD', 'OK', 'OH', 'UT', 'MO', 'MN', 'MI', 'KS', 'MT', 'MS', 'SC', 'KY', 'OR', 'SD'])






    











    



WARNING: The following ids in the csv are not included: set(['DE', 'DC', 'WI', 'WV', 'HI', 'WY', 'NH', 'NJ', 'NM', 'LA', 'NC', 'ND', 'NE', 'TN', 'RI', 'NV', 'CO', 'AK', 'AL', 'AR', 'VT', 'IN', 'IA', 'MA', 'AZ', 'ID', 'CT', 'ME', 'MD', 'OK', 'OH', 'UT', 'MO', 'MN', 'MI', 'KS', 'MT', 'MS', 'SC', 'KY', 'OR', 'SD'])






    











    



WARNING: The following ids in the csv are not included: set(['DE', 'DC', 'WI', 'WV', 'HI', 'WY', 'NH', 'NJ', 'NM', 'LA', 'NC', 'ND', 'NE', 'TN', 'RI', 'NV', 'CO', 'AK', 'AL', 'AR', 'VT', 'IN', 'IA', 'MA', 'AZ', 'ID', 'CT', 'ME', 'MD', 'OK', 'OH', 'UT', 'MO', 'MN', 'MI', 'KS', 'MT', 'MS', 'SC', 'KY', 'OR', 'SD'])

Split by topics



In [17]:

    
# Number of tweets per (display-named) topic.
df.topic_name.value_counts()









    Out[17]:





Privacy            73593
Seat Belt          73270
Vaccine            40713
Gun Control        34357
Skin Damage        14128
Child Education    10808
Name: topic_name, dtype: int64



In [18]:

    
def plot_by_topic(df, url_type, nstates=10):
    """Show an ``<h2>`` heading for ``url_type`` and draw one map per topic.

    For each topic in ``topic_order`` the tweets with at least one URL (and a
    ``u_state`` other than "USA") are grouped by state. The mapped value is the
    summed ``url_type`` label count multiplied by the total population and
    divided by the state's population; states absent from STATE_POPULATIONS
    fall back to the total population as divisor. Locations in NON_STATES are
    left off the map.

    ``nstates`` is accepted but not used by this function.
    """
    display(HTML("<h2>{}</h2>".format(url_type.upper())))
    total_population = sum(STATE_POPULATIONS.values())

    def label_count(counter):
        # Count of the requested label on one tweet (0 when absent).
        return counter.get(url_type, 0)

    for topic in topic_order:
        wanted_rows = (
            (df.u_state != "USA")
            & (df.t_n_urls > 0)
            & (df.topic_name == topic)
        )
        labelled = df[wanted_rows].assign(
            **{url_type: lambda frame: frame.CATS_Counter.apply(label_count)}
        )
        per_state = labelled[["u_state", url_type]].groupby("u_state")[url_type]
        df_t = per_state.agg([np.sum, np.mean, len, np.std]).reset_index()

        # Rank is taken on the plain per-state mean, before "mean" is replaced
        # by the population-scaled value below.
        df_t["value_rank"] = df_t["mean"].rank(ascending=False)

        state_population = df_t["u_state"].apply(
            lambda code: STATE_POPULATIONS.get(code, total_population)
        )
        df_t = df_t.assign(mean=df_t["sum"] * total_population / state_population)

        only_states = ~df_t["u_state"].isin(NON_STATES)
        plot_map(
            df_t[only_states],
            "u_state",
            "mean",
            ["u_state", "value_rank", "mean", "len", "std"],
            scl="Portland",
            title=topic,
            cbar_title="Proportion",
            decimals=3,
        )
        
# Value forwarded as the `nstates` argument by the plotting cells that follow.
nstates=None

Fake News Maps



In [19]:

    
# Per-topic maps for the 'fakenews' label.
url_type = "fakenews"
plot_by_topic(df, url_type, nstates=nstates)









    




FAKENEWS






    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])

Blog Maps



In [20]:

    
# Per-topic maps for the 'blog' label.
url_type = "blog"
plot_by_topic(df, url_type, nstates=nstates)









    




BLOG






    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])

News Maps



In [21]:

    
# Per-topic maps for the 'news' label.
url_type = "news"
plot_by_topic(df, url_type, nstates=nstates)









    




NEWS






    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])



In [22]:

    
def plot_map_subplots(df, url_type, decimals=2, nstates=10):
    """Draw one multihex map per topic, all sharing a single colour scale.

    For each topic in ``topic_order`` the tweets with at least one URL (and a
    ``u_state`` other than "USA") are grouped by state. The mapped value is the
    summed ``url_type`` label count divided by the state's share of the total
    population; states absent from STATE_POPULATIONS get a share of 1.
    Locations in NON_STATES are dropped. The values of all topics are binned
    together by one Colorbin so every map uses the same legend.

    Parameters
    ----------
    df : tweet-level DataFrame (needs u_state, t_n_urls, topic_name,
        CATS_Counter).
    url_type : label looked up in each tweet's CATS_Counter.
    decimals, nstates : accepted for call compatibility; not used here. The
        legend precision is fixed at 3 decimals.

    Returns
    -------
    None. A heading and the maps are displayed as a side effect.
    """
    display(HTML("<h2>{}</h2>".format(url_type.upper())))

    # One (topic, values, state ids) triple per topic, in topic_order.
    values_states = []
    total_population = sum(STATE_POPULATIONS.values())
    for topic in topic_order:
        df_t = df[(df.u_state != "USA")
                  & (df.t_n_urls > 0)
                  & (df.topic_name == topic)
                 ].assign(**{
        url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))}
        )[["u_state", url_type]].groupby("u_state")[url_type].agg([np.sum, np.mean, len, np.std]).reset_index()
        df_t = df_t.assign(mean=(df_t["sum"]) / df_t["u_state"].apply(
            lambda k: STATE_POPULATIONS.get(k, total_population)/total_population)
                          )

        df_t = df_t[~df_t["u_state"].isin(NON_STATES)]
        values_states.append((
            topic, df_t["mean"].astype(float).values.tolist(),
            df_t["u_state"].values.tolist()
        ))

    mycolors = ['#ffffd9','#edf8b1','#c7e9b4','#7fcdbb','#41b6c4','#1d91c0','#225ea8','#253494','#081d58']
    # Flatten the per-topic value lists in topic order; the colour lists that
    # come back from Colorbin are sliced in the same order further down.
    all_values = [value
                  for _, topic_values, _ in values_states
                  for value in topic_values]
    mybin = Colorbin(
        all_values,
        mycolors,
        proportional=True,
        decimals=None
    )
    mybin.set_decimals(3)
    mybin.recalc(fenceposts=True)
    mybin.calc_complements(0.5, '#e0e0e0', '#101010')

    colors_by_state_all = mybin.colors_out
    font_colors_by_state_all = mybin.complements
    legend_colors = mybin.colors_in
    legend_labels = mybin.labels

    curr_idx = 0
    for topic, _, states in values_states:
        # Slice this topic's colours out of the flattened lists.
        colors_by_state = colors_by_state_all[curr_idx:curr_idx+len(states)]
        font_colors_by_state = font_colors_by_state_all[curr_idx:curr_idx+len(states)]
        curr_idx += len(states)
        cg = Chorogrid(CHOROGRID_STATES_FILE, states, colors_by_state)
        cg.set_title(topic, font_dict={'font-size': 19})
        cg.set_legend(legend_colors, legend_labels, title="Proportion",
                  font_dict={'font-size': '10px', })
        # margin_right leaves room for the legend, which is otherwise cut off.
        cg.draw_multihex(spacing_dict={
            'margin_right': 150,
            'missing_color': '#ffffff',
            'stroke_color': '#000000',
            'stroke_width': 0.1
        }, font_dict={
            'stroke-width': '0.1px',
        }, font_colors=font_colors_by_state)
        cg.done(show=True)



In [23]:

    
# Per-topic 'fakenews' maps drawn with one colour scale shared across topics.
plot_map_subplots(df, url_type="fakenews", decimals=2, nstates=nstates)









    




FAKENEWS






    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])



In [24]:

    
# Per-topic 'blog' maps drawn with one colour scale shared across topics.
plot_map_subplots(df, url_type="blog", decimals=2, nstates=nstates)









    




BLOG






    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])



In [25]:

    
# Per-topic 'news' maps drawn with one colour scale shared across topics.
plot_map_subplots(df, url_type="news", decimals=2, nstates=nstates)









    




NEWS






    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])






    











    



WARNING: The following ids in the csv are not included: set(['DC'])

Show ratio in each state



In [26]:

    
def plot_map_subplots(df, url_type, decimals=2, nstates=10):
    """Draw one multihex map per topic of the per-state mean ``url_type`` count.

    NOTE: this redefines the ``plot_map_subplots`` declared earlier in the
    notebook. It now accepts the same ``nstates`` keyword, so the earlier call
    cells no longer raise TypeError if they are re-run after this cell.

    For each topic in ``topic_order`` the tweets with at least one URL (and a
    ``u_state`` other than "USA") are grouped by state and the mean
    ``url_type`` label count is mapped. Only locations with at least as many
    tweets as the ``nstates``-th busiest one are kept, and NON_STATES are
    dropped afterwards. All topics are binned together by one Colorbin, so
    every map uses the same legend.

    Parameters
    ----------
    df : tweet-level DataFrame (needs u_state, t_n_urls, topic_name,
        CATS_Counter).
    url_type : label looked up in each tweet's CATS_Counter.
    decimals : accepted for call compatibility; the legend precision is fixed
        at 3 decimals.
    nstates : size of the volume cut-off. Defaults to 10, the previously
        hard-coded value; ``None`` disables the cut-off. Having fewer than
        ``nstates`` locations keeps them all instead of raising IndexError.

    Returns
    -------
    None. A heading and the maps are displayed as a side effect.
    """
    display(HTML("<h2>{}</h2>".format(url_type.upper())))

    # One (topic, values, state ids) triple per topic, in topic_order.
    values_states = []

    for topic in topic_order:
        df_t = df[(df.u_state != "USA")
                  & (df.t_n_urls > 0)
                  & (df.topic_name == topic)
                 ].assign(**{
        url_type: lambda x: x.CATS_Counter.apply(lambda k: k.get(url_type, 0))}
        )[["u_state", url_type]].groupby("u_state")[url_type].agg([np.mean, len, np.std]).reset_index()

        keep = ~df_t["u_state"].isin(NON_STATES)
        if nstates is not None:
            # The threshold is computed over all locations, before NON_STATES
            # are removed, matching the original ordering of the two filters.
            min_tweets = df_t["len"].nlargest(nstates).min()
            keep = keep & (df_t["len"] >= min_tweets)
        df_t = df_t[keep]

        values_states.append((
            topic, df_t["mean"].astype(float).values.tolist(),
            df_t["u_state"].values.tolist()
        ))

    mycolors = ['#b35806','#e08214','#fdb863','#fee0b6','#f7f7f7','#d8daeb','#b2abd2','#8073ac','#542788']
    # Flatten the per-topic value lists in topic order; the colour lists that
    # come back from Colorbin are sliced in the same order further down.
    all_values = [value
                  for _, topic_values, _ in values_states
                  for value in topic_values]
    mybin = Colorbin(
        all_values,
        mycolors,
        proportional=True,
        decimals=None
    )
    mybin.set_decimals(3)
    mybin.recalc(fenceposts=True)
    mybin.calc_complements(0.5, '#e0e0e0', '#101010')

    colors_by_state_all = mybin.colors_out
    font_colors_by_state_all = mybin.complements
    legend_colors = mybin.colors_in
    legend_labels = mybin.labels

    curr_idx = 0
    for topic, _, states in values_states:
        # Slice this topic's colours out of the flattened lists.
        colors_by_state = colors_by_state_all[curr_idx:curr_idx+len(states)]
        font_colors_by_state = font_colors_by_state_all[curr_idx:curr_idx+len(states)]
        curr_idx += len(states)
        cg = Chorogrid(CHOROGRID_STATES_FILE, states, colors_by_state)
        cg.set_title(topic, font_dict={'font-size': 19})
        cg.set_legend(legend_colors, legend_labels, title="Proportion",
                  font_dict={'font-size': '10px', })
        # margin_right leaves room for the legend, which is otherwise cut off.
        cg.draw_multihex(spacing_dict={
            'margin_right': 150,
            'missing_color': '#ffffff',
            'stroke_color': '#000000',
            'stroke_width': 0.1
        }, font_dict={
            'stroke-width': '0.1px',
        }, font_colors=font_colors_by_state)
        cg.done(show=True)

Analysis



In [27]:

    
# For every topic, list the highest-volume locations ranked by their mean
# 'fakenews' label count, formatted as "STATE (mean) [n tweets]".
TOP_N_LOCATIONS = 10

df_topics = {}
for topic in topic_order:
    df_t = df[(df.u_state != "USA")
              & (df.t_n_urls > 0)
              & (df.topic_name == topic)
             ].assign(
    fakenews=lambda x: x.CATS_Counter.apply(lambda k: k.get('fakenews', 0))
    )[["u_state", "fakenews"]].groupby("u_state")["fakenews"].agg([np.mean, len, np.std]).reset_index()

    # Tweet volume of the TOP_N_LOCATIONS-th busiest location. nlargest()
    # copes with a topic that has fewer locations than that (all are kept),
    # whereas indexing the sorted values at [-10] raised IndexError.
    min_tweets = df_t["len"].nlargest(TOP_N_LOCATIONS).min()

    # Sort by mean and reset the index so that row i is rank i in every topic
    # column of the concatenated table.
    ranked = (
        df_t[df_t["len"] >= min_tweets]
        .sort_values("mean", ascending=False)
        .reset_index()
    )
    df_topics[topic] = ranked.apply(
        lambda x: "%s (%.2f) [%s]" % (x["u_state"], x["mean"], x["len"]),
        axis=1,
    )
pd.concat(df_topics, axis=1, keys=topic_order)









    Out[27]:







  
    
      
      Gun Control
      Privacy
      Vaccine
      Child Education
      Skin Damage
      Seat Belt
    
  
  
    
      0
      VA (0.18) [330]
      FL (0.07) [1252]
      FL (0.15) [745]
      DC (0.01) [154]
      IL (0.01) [236]
      TX (0.01) [759]
    
    
      1
      FL (0.18) [707]
      IL (0.07) [742]
      OH (0.14) [413]
      CA (0.01) [627]
      OH (0.01) [183]
      OH (0.01) [257]
    
    
      2
      TX (0.18) [938]
      DC (0.06) [1846]
      TX (0.13) [978]
      IL (0.01) [173]
      CA (0.00) [871]
      CA (0.01) [1404]
    
    
      3
      GA (0.13) [339]
      PA (0.06) [594]
      GA (0.08) [436]
      NY (0.00) [402]
      AZ (0.00) [236]
      GA (0.01) [356]
    
    
      4
      PA (0.12) [312]
      TX (0.06) [1526]
      CA (0.08) [3507]
      FL (0.00) [258]
      FL (0.00) [489]
      FL (0.01) [661]
    
    
      5
      IL (0.12) [421]
      NY (0.05) [2563]
      NY (0.08) [1660]
      GA (0.00) [138]
      GA (0.00) [275]
      PA (0.01) [346]
    
    
      6
      DC (0.12) [429]
      CA (0.05) [3165]
      PA (0.07) [432]
      MO (0.00) [127]
      NY (0.00) [636]
      NY (0.01) [965]
    
    
      7
      CA (0.11) [1530]
      VA (0.04) [715]
      DC (0.06) [385]
      NJ (0.00) [123]
      TX (0.00) [451]
      IL (0.01) [329]
    
    
      8
      NY (0.09) [1154]
      MA (0.04) [607]
      MA (0.06) [447]
      PA (0.00) [142]
      NC (0.00) [211]
      WA (0.00) [305]
    
    
      9
      WA (0.06) [394]
      WA (0.03) [1050]
      WA (0.06) [736]
      TX (0.00) [271]
      PA (0.00) [216]
      MI (0.00) [258]



In [28]:

    
# Bar chart of is_controversial per u_state (locations in NON_STATES removed,
# bars ordered by state code), with a dashed reference line at 0.5.
fig, ax = plt.subplots(1,1,figsize=(15,5))
# Font sizes and the monospace family apply only inside this context.
with sns.plotting_context(
    rc={"axes.titlesize": 14,
        "axes.labelsize": 14,
        "xtick.labelsize": 12,
        "ytick.labelsize": 14,
       }), sns.axes_style(
    rc={"font.family": "monospace"}):
    g = sns.barplot(y="is_controversial", x="u_state",
                    errwidth=2,
                data=df[~df.u_state.isin(NON_STATES)].sort_values("u_state"),
               ax=ax, color="0.7")
    ax.axhline(y=0.5, linestyle='--', color="k", lw=1.)
    ax.set_ylabel("Proportion of controversial tweets")
    ax.set_xlabel("US States")
    #ax.tick_params(axis='x', which='major', labelsize=10)
    sns.despine(offset=10)









    



/homed/content/anaconda3/envs/python2/lib/python2.7/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family [u'sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))



In [29]:

    
# x-axis order: unknown and country-level first, then the state codes found in
# the data (alphabetical), then DC and the territory codes.
leading_codes = ["UNK", "USA"]
state_codes = sorted(set(
    df.u_state.fillna("UNK").value_counts().index
) - NON_STATES)
trailing_codes = sorted(["AS", "DC", "GU", "MP", "PR", "VI"])
LOCATION_ORDER = leading_codes + state_codes + trailing_codes

# One colour per bar, derived from the group sizes so it always lines up with
# LOCATION_ORDER (the state count used to be hard-coded as 50).
colors = (["b"] * len(leading_codes)
          + ["r"] * len(state_codes)
          + ["0.7"] * len(trailing_codes))

fig, ax = plt.subplots(1,1,figsize=(16,5))
with sns.plotting_context(
    rc={"axes.titlesize": 14,
        "axes.labelsize": 14,
        "xtick.labelsize": 12,
        "ytick.labelsize": 14,
       }), sns.axes_style(
    rc={"font.family": "monospace"}):
    # Missing u_state values are plotted under "UNK".
    g = sns.barplot(y="is_controversial", x="u_state",
                    errwidth=2,
                data=df.assign(u_state=df.u_state.fillna("UNK")),
               ax=ax, color="r", order=LOCATION_ORDER)
    ax.axhline(y=0.5, linestyle='--', color="k", lw=1.)
    ax.set_ylabel("Proportion of controversial tweets")
    ax.set_xlabel("US States")
    # Recolour the bars group by group. zip() stops at the shorter sequence,
    # so a mismatch in bar count can no longer raise IndexError.
    for patch, bar_color in zip(ax.patches, colors):
        patch.set_color(bar_color)
    sns.despine(offset=10)
    # Only the first three tick labels are rotated.
    plt.setp(ax.get_xticklabels()[:3], rotation=90)



In [30]:

    
# Share of all controversial tweets (from locations outside NON_STATES) that
# each state contributes.
# x-axis order: "UNK"/"USA", then the state codes found in the data, then DC
# and the territory codes; only the state slice [2:-6] is plotted below.
LOCATION_ORDER = (["UNK", "USA"] + sorted(set(
            df.u_state.fillna("UNK").value_counts().index
    ) - NON_STATES)+ sorted(["AS", "DC", "GU",
              "MP", "PR", "VI"]))
# `colors` is only referenced by the commented-out recolouring line further down.
colors = ["b"] * 2 + ["r"]*50 +["k"]*6
# Multiplying by 1. makes the total a float, so the division in the estimator
# below is true division.
total_controversial = df[(df.is_controversial == 1) & (~df.u_state.isin(NON_STATES))].shape[0] * 1.
fig, ax = plt.subplots(1,1,figsize=(16,5))
with sns.plotting_context(
    rc={"axes.titlesize": 14,
        "axes.labelsize": 14,
        "xtick.labelsize": 12,
        "ytick.labelsize": 14,
       }), sns.axes_style(
    rc={"font.family": "monospace"}):
    # The estimator turns each state's row count into a fraction of the total.
    g = sns.barplot(y="is_controversial", x="u_state",
                    
                data=df[
            (df.is_controversial == 1)
            & (~df.u_state.isin(NON_STATES))
        ],
               ax=ax, color="0.5",
                    order=LOCATION_ORDER[2:-6],
                    ci=None, estimator=lambda x: len(x)/total_controversial)
    #ax.axhline(y=0.5, linestyle='--', color="k", lw=1.)
    ax.set_ylabel("Distribution of controversial tweets\nacross states")
    ax.set_xlabel("US States")
    #ax.tick_params(axis='x', which='major', labelsize=10)
    #[ax.patches[i].set_color(c) for i, c in enumerate(colors)]
    sns.despine(offset=10)
    #plt.setp(ax.get_xticklabels()[:3], rotation=90)



In [31]:

    
# Dot plot (seaborn stripplot on a PairGrid): one panel per topic showing each
# state's share of that topic's tweets.

# Tweet counts per state (rows) and topic (columns); non-states and missing
# u_state values are excluded.
df_t = df[(~df.u_state.isin(NON_STATES)) & (~df.u_state.isnull())].pivot_table(
    index="u_state", columns="topic_name", values="t_id", aggfunc=len)
with sns.plotting_context(
    rc={"axes.titlesize": 10,
        "axes.labelsize": 10,
        "xtick.labelsize": 10,
        "ytick.labelsize": 10,
       }), sns.axes_style(
    rc={"font.family": "monospace"}):
    # Each topic column is divided by its own total, so a column sums to 1
    # over the states.
    g = sns.PairGrid(df_t.divide(df_t.sum(axis=0), axis=1).reset_index(),
                     x_vars=topic_order, y_vars=["u_state"],
                     size=10, aspect=.25)
    g.map(sns.stripplot, size=10, orient="h",
          color="k", edgecolor="gray")

    # Label the x axes and clear the y label (axis limits are left untouched).
    g.set(xlabel="proportion", ylabel="",)

    # Use the topic names as panel titles
    titles = topic_order

    for ax, title in zip(g.axes.flat, titles):

        # Set a different title for each axes
        ax.set(title=title)

        # Show grid lines for the y axis only
        ax.xaxis.grid(False)
        ax.yaxis.grid(True)

    sns.despine(left=True, bottom=True)



In [32]:

    
# x-axis order: unknown and country-level first, then the state codes found in
# the data (alphabetical), then DC and the territory codes.
leading_codes = ["UNK", "USA"]
state_codes = sorted(set(
    df.u_state.fillna("UNK").value_counts().index
) - NON_STATES)
trailing_codes = sorted(["AS", "DC", "GU", "MP", "PR", "VI"])
LOCATION_ORDER = leading_codes + state_codes + trailing_codes

# One colour per bar, derived from the group sizes so it always lines up with
# LOCATION_ORDER (the state count used to be hard-coded as 50).
colors = (["b"] * len(leading_codes)
          + ["r"] * len(state_codes)
          + ["0.7"] * len(trailing_codes))

with sns.plotting_context(
    rc={"axes.titlesize": 14,
        "axes.labelsize": 14,
        "xtick.labelsize": 12,
        "ytick.labelsize": 14,
       }), sns.axes_style(
    rc={"font.family": "monospace"}):
    fig, ax = plt.subplots(1,1, figsize=(20,5))
    # Tweets per author location on a log-scaled y axis; missing u_state
    # values are counted under "UNK".
    ax = sns.countplot(df.u_state.fillna("UNK"), color='k', ax=ax,
                      order=LOCATION_ORDER)
    ax.set_yscale('log')
    ax.set_ylabel('Frequency')
    ax.set_xlabel('Tweet author location')
    plt.xticks(rotation='vertical')
    # Recolour the bars group by group. zip() stops at the shorter sequence,
    # so a mismatch in bar count can no longer raise IndexError.
    for patch, bar_color in zip(ax.patches, colors):
        patch.set_color(bar_color)



In [33]:

    
# Tweet counts per author location (missing values as "UNK"), split into four
# chunks and laid out as side-by-side (Location, Counts) column pairs.
location_counts = df.u_state.fillna("UNK").value_counts()
pd.concat(
    [
        pd.DataFrame(chunk.reset_index().values, columns=["Location", "Counts"])
        for chunk in np.array_split(location_counts, 4, axis=0)
    ],
    axis=1,
)



In [34]:

    
# Summary of the non-missing u_state values (count, distinct codes, most common).
df.u_state.describe()









    Out[34]:





count     169038
unique        57
top           CA
freq       22123
Name: u_state, dtype: object



In [35]:

    
# Total number of rows, including those with a missing u_state.
df.u_state.shape









    Out[35]:





(246869,)



In [36]:

    
# One u_state per user (the first value seen for each u_id): number of users,
# then the summary of their states.
first_state_per_user = df.groupby("u_id")["u_state"].first()
first_state_per_user.shape, first_state_per_user.describe()









    Out[36]:





((151073,), count     107970
 unique        57
 top           CA
 freq       13251
 Name: u_state, dtype: object)



In [ ]:

	u_state	mean	len	std
0	AK	0.618557	291	0.486578
1	AL	0.587302	1449	0.492489
2	AR	0.634062	869	0.481970
3	AS	0.300000	10	0.483046
4	AZ	0.573411	3242	0.494658
5	CA	0.654161	22123	0.475652
6	CO	0.617085	2669	0.486189
7	CT	0.594237	1284	0.491230
8	DC	0.822330	5150	0.382272
9	DE	0.543796	274	0.498990
10	FL	0.599461	8913	0.490035
11	GA	0.549006	4377	0.497649
12	GU	0.000000	5	0.000000
13	HI	0.555777	502	0.497375
14	IA	0.562099	934	0.496395
15	ID	0.611529	399	0.488015
16	IL	0.567998	5331	0.495401
17	IN	0.526071	3222	0.499397
18	KS	0.394879	1484	0.488989
19	KY	0.572368	1368	0.494916
20	LA	0.468478	1951	0.499133
21	MA	0.632763	3913	0.482114
22	MD	0.556561	2431	0.496893
23	ME	0.650817	673	0.477066
24	MI	0.574339	3141	0.494522
25	MN	0.572537	1675	0.494858
26	MO	0.564935	2002	0.495889
27	MP	1.000000	2	0.000000
28	MS	0.567430	786	0.495748
29	MT	0.674740	289	0.469284
30	NC	0.612668	3568	0.487209
31	ND	0.456250	160	0.499646
32	NE	0.458272	683	0.498621
33	NH	0.682051	585	0.466078
34	NJ	0.589065	3402	0.492076
35	NM	0.636574	432	0.481544
36	NV	0.588895	1693	0.492180
37	NY	0.656886	14689	0.474765
38	OH	0.580961	4601	0.493455
39	OK	0.555787	1443	0.497050
40	OR	0.651773	2171	0.476518
41	PA	0.606705	4653	0.488534
42	PR	0.625000	24	0.494535
43	RI	0.600000	455	0.490437
44	SC	0.573099	1539	0.494788
45	SD	0.405063	237	0.491943
46	TN	0.514493	2622	0.499885
47	TX	0.569604	11666	0.495153
48	UT	0.482531	1059	0.499931
49	VA	0.639094	3796	0.480327
50	VI	0.000000	5	0.000000
51	VT	0.715827	278	0.451833
52	WA	0.701355	5093	0.457709
53	WI	0.624379	1813	0.484416
54	WV	0.556923	325	0.497515
55	WY	0.606936	173	0.489849

	Gun Control	Privacy	Vaccine	Child Education	Skin Damage	Seat Belt
0	VA (0.18) [330]	FL (0.07) [1252]	FL (0.15) [745]	DC (0.01) [154]	IL (0.01) [236]	TX (0.01) [759]
1	FL (0.18) [707]	IL (0.07) [742]	OH (0.14) [413]	CA (0.01) [627]	OH (0.01) [183]	OH (0.01) [257]
2	TX (0.18) [938]	DC (0.06) [1846]	TX (0.13) [978]	IL (0.01) [173]	CA (0.00) [871]	CA (0.01) [1404]
3	GA (0.13) [339]	PA (0.06) [594]	GA (0.08) [436]	NY (0.00) [402]	AZ (0.00) [236]	GA (0.01) [356]
4	PA (0.12) [312]	TX (0.06) [1526]	CA (0.08) [3507]	FL (0.00) [258]	FL (0.00) [489]	FL (0.01) [661]
5	IL (0.12) [421]	NY (0.05) [2563]	NY (0.08) [1660]	GA (0.00) [138]	GA (0.00) [275]	PA (0.01) [346]
6	DC (0.12) [429]	CA (0.05) [3165]	PA (0.07) [432]	MO (0.00) [127]	NY (0.00) [636]	NY (0.01) [965]
7	CA (0.11) [1530]	VA (0.04) [715]	DC (0.06) [385]	NJ (0.00) [123]	TX (0.00) [451]	IL (0.01) [329]
8	NY (0.09) [1154]	MA (0.04) [607]	MA (0.06) [447]	PA (0.00) [142]	NC (0.00) [211]	WA (0.00) [305]
9	WA (0.06) [394]	WA (0.03) [1050]	WA (0.06) [736]	TX (0.00) [271]	PA (0.00) [216]	MI (0.00) [258]

	Location	Counts	Location	Counts	Location	Counts	Location	Counts
0	UNK	77831	NJ	3402	AL	1449	ID	399
1	CA	22123	AZ	3242	OK	1443	WV	325
2	USA	21114	IN	3222	KY	1368	AK	291
3	NY	14689	MI	3141	CT	1284	MT	289
4	TX	11666	CO	2669	UT	1059	VT	278
5	FL	8913	TN	2622	IA	934	DE	274
6	IL	5331	MD	2431	AR	869	SD	237
7	DC	5150	OR	2171	MS	786	WY	173
8	WA	5093	MO	2002	NE	683	ND	160
9	PA	4653	LA	1951	ME	673	PR	24
10	OH	4601	WI	1813	NH	585	AS	10
11	GA	4377	NV	1693	HI	502	GU	5
12	MA	3913	MN	1675	RI	455	VI	5
13	VA	3796	SC	1539	NM	432	MP	2
14	NC	3568	KS	1484	NaN	NaN	NaN	NaN

Chorogrid plot

Plot individual topic maps

Split by topics

Fake News Maps

FAKENEWS

Blog Maps

BLOG

News Maps

NEWS

Plots which share the colorbar

FAKENEWS

BLOG

NEWS

Show ratio in each state

Analysis